
### -------------------------------------------------- 
### ---- General Set Up ---- 
### -------------------------------------------------- 

### Clear global environment
rm(list=ls()) 


### Libraries:
library(pacman)
p_load(tidyverse, foreign, readstata13, readr,
       magrittr, here,
       readxl)

### File Location (R project only)
here::i_am("Survey Only Match/101_match_tfa_to_state_address.R")
file_path <- here()


### -------------------------------------------------- 
### ---- Download TFA Data ----
### -------------------------------------------------- 

### Download Survey/Application Data
tfa_dat <- read.dta13(paste0(file_path,"/Data/tfa_data.dta"))


### -------------------------------------------------- 
### ---- Download Address Data ----
### -------------------------------------------------- 

tfa_address_file_path <- paste0(file_path,"/Data/2007.2015 Apps Submitted with Address.csv")
tfa_address_only <- read_csv(tfa_address_file_path, col_names = TRUE, na = c("", "NA")) %>%
  rename(personid = Person_Id) %>%
  mutate(personid = as.character(personid)) %>%
  mutate(match_address = "Matched to Address") %>%
  mutate(state_ca = Current_Address.State) %>%
  mutate(state_pa = Permanent_Address.State)

# And additional address file to correct problem with 2009 cohort addresses
  # merge in new data source of addresses and select the earliest address available

# Read New PA Data
new_tfa_address <- read_xlsx(paste0(file_path,"/Data/PersonAddress_2009CMs.xlsx"), 
                                 col_names = TRUE, na = c("", "NA")) %>%
  rename(personid = personkey) %>%
  mutate(personid = as.character(personid))

# Filter for oldest current address entries 
new_tfa_address %<>%
  filter(TYPE=="CURRENT") %>%
  filter(!is.na(STATE)) %>% 
  group_by(personid) %>% 
  filter(UPDATETIME == min(UPDATETIME)) %>%
  distinct() %>%
  ungroup()

# Rename ca vars
new_tfa_address %<>% 
  select(personid, ADDRESS1, ADDRESS2, CITY, STATE, ZIP, COUNTRY) %>%
  rename(Current_Address.Address1_new = ADDRESS1,
         Current_Address.Address2_new = ADDRESS2,
         Current_Address.City_new = CITY,
         Current_Address.Country_new = COUNTRY,
         Current_Address.State_new = STATE,
         Current_Address.Zip_new = ZIP) %>%
  mutate(state_ca_new = Current_Address.State_new)

# Merge to get app year and filter for 2009 cohort only 
new_tfa_address_ca <- tfa_dat %>%
  select(personid, appyear) %>% 
  left_join(x = new_tfa_address,
            y = .,
            by = "personid") %>%
  filter(appyear==2009) %>%
  select(-appyear)
rm(new_tfa_address)
  
### Merge w. other address data
tfa_address_only %<>% left_join(x = .,
                                y = new_tfa_address_ca,
                                by = "personid")
rm(new_tfa_address_ca)


### -------------------------------------------------- 
### ---- Match Address Data ----
### -------------------------------------------------- 

### Merge survey/application data with address data
tfa_dat_address <- left_join(x = tfa_dat,
                             y = tfa_address_only,
                             by = "personid") 

### Add new 2009 pa data
  ### replace address with oldest address from 2009 file if in 2009 app cohort
tfa_dat_address %<>% 
  mutate(state_ca = ifelse(appyear==2009, state_ca_new, state_ca),
         Current_Address.Address1 = ifelse(appyear==2009, Current_Address.Address1_new, Current_Address.Address1),
         Current_Address.Address2 = ifelse(appyear==2009, Current_Address.Address2_new, Current_Address.Address2),
         Current_Address.City = ifelse(appyear==2009, Current_Address.City_new, Current_Address.City),
         Current_Address.Country = ifelse(appyear==2009, Current_Address.Country_new, Current_Address.Country),
         Current_Address.State = ifelse(appyear==2009, Current_Address.State_new, Current_Address.State),
         Current_Address.Zip = ifelse(appyear==2009, Current_Address.Zip_new, Current_Address.Zip)
         ) 
  
### --- 
### NOTE:
### - Matched 115,429 to address file (95.9%)
### - Failed to match 4,988 to address file (4.1%)
### --- 

### -------------------------------------------------- 
### ---- Clean Address Data ----
### -------------------------------------------------- 


### Loop through List of States in TFA Address dataset (CA)
## Problem: state could be written in multiple ways (IN, in, Indiana)
## Solution: create a key that indicates, for each unique state entered
##  in the application data, the standardized state abbreviation

#list unique states for current addresses
unique_state_ca <- tfa_dat_address %>%
  group_by(state_ca) %>%
  summarise(n=n()) %>% print()

#list unique states for permanent addresses
unique_state_pa <- tfa_dat_address %>%
  group_by(state_pa) %>%
  summarise(n=n()) %>% print()



  #vector of NAs the length of the number of unique
  #state names listed as current addresses
state_ca_clean <- rep(NA, nrow(unique_state_ca[,1]))
state_ca_clean <- as.character(state_ca_clean)
unique_state_ca <- cbind(unique_state_ca,state_ca_clean) 

# Search for State Abbreviations 
data(state) #load dataset with state abbreviations
state.abb <- rbind(state.abb, "DC")
#for each state abbreviation,
for(name in seq_along(state.abb)){
  #for each unique value of current address state,
  for(i in seq_along(unique_state_ca[,"state_ca"])){
    #if the abbreviation is the same as the state listed,
    if(grepl(paste0("^",state.abb[name],"$"), 
             unique_state_ca[i,"state_ca"], 
             ignore.case=TRUE))
      #replace the item in the same position with
      #the abbreviation for the state
      state_ca_clean[i] <- state.abb[name]
  }
}
# Search for State Names 
  #repeat, but this time checking for state names,
  #not state abbreviations
data(state)
for(name in seq_along(state.name)){
  for(i in seq_along(unique_state_ca[,"state_ca"])){
    if(grepl(paste0("^",state.name[name],"$"), 
             unique_state_ca[i,"state_ca"], 
             ignore.case=TRUE))
      state_ca_clean[i] <- state.abb[name]
  }
}

unique_state_ca <- cbind(unique_state_ca,state_ca_clean)

colnames(unique_state_ca) <- c("state_ca","n","DELETE","state_ca_clean")
unique_state_ca %<>% select(state_ca, state_ca_clean)

# result: a dataframe with two columns:
  # a state name as listed in the address 
  # a standardized two-letter abbreviation of a state, which we think the text in column 1 refers to


# REPEAT For permanent addresses

### Loop through List of States in TFA Address dataset (PA)
state_pa_clean <- rep(NA, nrow(unique_state_pa[,1]))
state_pa_clean <- as.character(state_pa_clean)
unique_state_pa <- cbind(unique_state_pa,state_pa_clean) 

# Search for State Abbreviations 
data(state)
state.abb <- rbind(state.abb, "DC")
for(name in seq_along(state.abb)){
  for(i in seq_along(unique_state_pa[,"state_pa"])){
    if(grepl(paste0("^",state.abb[name],"$"), 
             unique_state_pa[i,"state_pa"], 
             ignore.case=TRUE))
      state_pa_clean[i] <- state.abb[name]
  }
}
# Search for State Names 
data(state)
for(name in seq_along(state.name)){
  for(i in seq_along(unique_state_pa[,"state_pa"])){
    if(grepl(paste0("^",state.name[name],"$"), 
             unique_state_pa[i,"state_pa"], 
             ignore.case=TRUE))
      state_pa_clean[i] <- state.abb[name]
  }
}

unique_state_pa <- cbind(unique_state_pa,state_pa_clean)

colnames(unique_state_pa) <- c("state_pa","n","DELETE","state_pa_clean")
unique_state_pa %<>% select(state_pa, state_pa_clean)


### -------------------------------------------------- 
### ---- Merge Cleaned Address Data ----
### -------------------------------------------------- 

#merge in key column with standardized state abbvs
tfa_dat_address <- left_join(x = tfa_dat_address,
                             y = unique_state_ca,
                             by = "state_ca") 

tfa_dat_address <- left_join(x = tfa_dat_address,
                             y = unique_state_pa,
                             by = "state_pa") 

# Replace old version of pa and ca state vars
tfa_dat_address %<>%
  mutate(state_ca = state_ca_clean,
         state_pa = state_pa_clean)

### --- 
### NOTE(s): Current Address Data
### - 475 Unique state responses 
### - 11.2% of respondent addresses not matched to a state 
### 
### NOTE(s): Permanent Address Data
### - 220 Unique state responses 
### - 31.2% of respondent addresses not matched to a state 
### --- 


### -------------------------------------------------- 
### ---- Saved Matched Data ---- 
### -------------------------------------------------- 

tfa_dat_address_FINAL <- tfa_dat_address

save(tfa_dat_address_FINAL, 
     file=paste0(file_path,"/Survey Only Match/Temp_Data/tfa_to_state_address.RData"))




